{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import itertools\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-09-07 17:45:10--  https://raw.githubusercontent.com/mouna99/dien/master/data.tar.gz\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 81977637 (78M) [application/octet-stream]\n",
      "Saving to: ‘data.tar.gz’\n",
      "\n",
      "100%[======================================>] 81,977,637   105MB/s   in 0.7s   \n",
      "\n",
      "2019-09-07 17:45:11 (105 MB/s) - ‘data.tar.gz’ saved [81977637/81977637]\n",
      "\n",
      "--2019-09-07 17:45:11--  https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 104430448 (100M) [application/octet-stream]\n",
      "Saving to: ‘data1.tar.gz’\n",
      "\n",
      "100%[======================================>] 104,430,448 86.5MB/s   in 1.2s   \n",
      "\n",
      "2019-09-07 17:45:13 (86.5 MB/s) - ‘data1.tar.gz’ saved [104430448/104430448]\n",
      "\n",
      "--2019-09-07 17:45:13--  https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 9460706 (9.0M) [application/octet-stream]\n",
      "Saving to: ‘data2.tar.gz’\n",
      "\n",
      "100%[======================================>] 9,460,706   --.-K/s   in 0.1s    \n",
      "\n",
      "2019-09-07 17:45:13 (95.0 MB/s) - ‘data2.tar.gz’ saved [9460706/9460706]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data.tar.gz\n",
    "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data1.tar.gz\n",
    "! wget --no-check-certificate https://raw.githubusercontent.com/mouna99/dien/master/data2.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data/\n",
      "data/cat_voc.pkl\n",
      "data/mid_voc.pkl\n",
      "data/uid_voc.pkl\n",
      "data/local_train_splitByUser\n",
      "data/local_test_splitByUser\n",
      "data1/\n",
      "data1/reviews-info\n",
      "data2/\n",
      "data2/item-info\n"
     ]
    }
   ],
   "source": [
    "! tar jxvf ./data.tar.gz && tar jxvf ./data1.tar.gz && tar jxvf ./data2.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_RUN = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df = pd.read_csv(\n",
    "    \"./data/local_train_splitByUser\", sep='\\t',\n",
    "    names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])\n",
    "\n",
    "valid_df = pd.read_csv(\n",
    "    \"./data/local_test_splitByUser\", sep='\\t',\n",
    "    names=['label', 'uid', 'mid', 'cat', 'hist_mids', 'hist_cats'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_info_df = pd.read_csv(\"./data2/item-info\", sep='\\t', names=['mid', 'cat'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mid</th>\n",
       "      <th>cat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0001048791</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0001048775</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0001048236</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000401048</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0001019880</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          mid    cat\n",
       "0  0001048791  Books\n",
       "1  0001048775  Books\n",
       "2  0001048236  Books\n",
       "3  0000401048  Books\n",
       "4  0001019880  Books"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "item_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_info_df = pd.read_csv(\"./data1/reviews-info\", sep='\\t', names=['c1', 'mid', 'c3', 'c4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c1</th>\n",
       "      <th>mid</th>\n",
       "      <th>c3</th>\n",
       "      <th>c4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A10000012B7CGYKOMPQ4L</td>\n",
       "      <td>000100039X</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1355616000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2S166WSCFIFP5</td>\n",
       "      <td>000100039X</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1071100800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A1BM81XB4QHOA3</td>\n",
       "      <td>000100039X</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1390003200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A1MOSTXNIO5MPJ</td>\n",
       "      <td>000100039X</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1317081600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A2XQ5LZHTD4AFT</td>\n",
       "      <td>000100039X</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1033948800</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      c1         mid   c3          c4\n",
       "0  A10000012B7CGYKOMPQ4L  000100039X  5.0  1355616000\n",
       "1         A2S166WSCFIFP5  000100039X  5.0  1071100800\n",
       "2         A1BM81XB4QHOA3  000100039X  5.0  1390003200\n",
       "3         A1MOSTXNIO5MPJ  000100039X  5.0  1317081600\n",
       "4         A2XQ5LZHTD4AFT  000100039X  5.0  1033948800"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reviews_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "reviews_info_df = reviews_info_df[['mid']].merge(item_info_df, on='mid', how='inner').drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mid</th>\n",
       "      <th>cat</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000100039X</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>0001055178</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>224</th>\n",
       "      <td>0001473123</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>240</th>\n",
       "      <td>0001473727</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>247</th>\n",
       "      <td>0001473905</td>\n",
       "      <td>Books</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            mid    cat\n",
       "0    000100039X  Books\n",
       "206  0001055178  Books\n",
       "224  0001473123  Books\n",
       "240  0001473727  Books\n",
       "247  0001473905  Books"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reviews_info_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "mid_cat_map = reviews_info_df.set_index('mid').to_dict()['cat']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "if TEST_RUN:\n",
    "    train_df = train_df.sample(1000)\n",
    "    valid_df = valid_df.sample(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# slow implement\n",
    "def prepare_neg(df):\n",
    "    records = df['hist_mids'].apply(lambda x: x.split(\"\u0002\"))\n",
    "    candidates = list(mid_cat_map.keys())\n",
    "    max_len = len(candidates)\n",
    "\n",
    "    def neg_sampling(filters, length):    \n",
    "        mids = []\n",
    "        cats = []\n",
    "        for i in range(length):\n",
    "            while(1):\n",
    "                c = candidates[np.random.randint(0, max_len)]\n",
    "                if c not in filters:\n",
    "                    mids.append(c)\n",
    "                    cats.append(mid_cat_map[c])\n",
    "                    filters.add(c)\n",
    "                    break\n",
    "        return mids, cats\n",
    "    \n",
    "    total_neg_mids = []\n",
    "    total_neg_cats = []\n",
    "    for record in records:\n",
    "        neg_mids, neg_cats = neg_sampling(set(record), len(record))\n",
    "        total_neg_mids.append(\"\u0002\".join(neg_mids))\n",
    "        total_neg_cats.append(\"\u0002\".join(neg_cats))\n",
    "    \n",
    "    return total_neg_mids, total_neg_cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_neg_mids, total_neg_cats = prepare_neg(train_df)\n",
    "train_df['neg_hist_mids'] = total_neg_mids\n",
    "train_df['neg_hist_cats'] = total_neg_cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_neg_mids, total_neg_cats = prepare_neg(valid_df)\n",
    "valid_df['neg_hist_mids'] = total_neg_mids\n",
    "valid_df['neg_hist_cats'] = total_neg_cats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df.to_csv('local_train.csv', sep='\\t', index=False)\n",
    "valid_df.to_csv('local_test.csv', sep='\\t', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
